{
 "cells": [
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:32:49.268657Z",
     "start_time": "2025-09-06T05:32:48.247150Z"
    }
   },
   "source": "from datasets import *",
   "outputs": [],
   "execution_count": 4
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# datasets 基本使用"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载在线数据集"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:00.261638Z",
     "start_time": "2025-09-06T05:32:53.053355Z"
    }
   },
   "source": [
    "datasets = load_dataset(\"madao33/new-title-chinese\")\n",
    "datasets"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['title', 'content'],\n",
       "        num_rows: 5850\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['title', 'content'],\n",
       "        num_rows: 1679\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 5
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": ""
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载数据集合集中的某一项任务"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:04.718418Z",
     "start_time": "2025-09-06T05:33:04.663533Z"
    }
   },
   "source": [
    "boolq_dataset = load_dataset(\"super_glue\", \"boolq\")\n",
    "boolq_dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['question', 'passage', 'idx', 'label'],\n",
       "        num_rows: 9427\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['question', 'passage', 'idx', 'label'],\n",
       "        num_rows: 3270\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['question', 'passage', 'idx', 'label'],\n",
       "        num_rows: 3245\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 6
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 按照数据集划分进行加载"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:14.229827Z",
     "start_time": "2025-09-06T05:33:10.468042Z"
    }
   },
   "source": [
    "dataset = load_dataset(\"madao33/new-title-chinese\", split = \"train\")\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['title', 'content'],\n",
       "    num_rows: 5850\n",
       "})"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 7
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:22.321565Z",
     "start_time": "2025-09-06T05:33:16.177131Z"
    }
   },
   "source": [
    "dataset = load_dataset(\"madao33/new-title-chinese\", split = \"train[10:100]\")\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['title', 'content'],\n",
       "    num_rows: 90\n",
       "})"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 8
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:29.285801Z",
     "start_time": "2025-09-06T05:33:24.377600Z"
    }
   },
   "source": [
    "dataset = load_dataset(\"madao33/new-title-chinese\", split = \"train[:50%]\")\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['title', 'content'],\n",
       "    num_rows: 2925\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 9
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:35.151499Z",
     "start_time": "2025-09-06T05:33:31.605449Z"
    }
   },
   "source": [
    "dataset = load_dataset(\"madao33/new-title-chinese\", split = [\"train[:50%]\", \"train[50%:]\"])\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Dataset({\n",
       "     features: ['title', 'content'],\n",
       "     num_rows: 2925\n",
       " }),\n",
       " Dataset({\n",
       "     features: ['title', 'content'],\n",
       "     num_rows: 2925\n",
       " })]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 10
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 查看数据集"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:43.212989Z",
     "start_time": "2025-09-06T05:33:39.260766Z"
    }
   },
   "source": [
    "datasets = load_dataset(\"madao33/new-title-chinese\")\n",
    "datasets"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['title', 'content'],\n",
       "        num_rows: 5850\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['title', 'content'],\n",
       "        num_rows: 1679\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 11
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:46.784644Z",
     "start_time": "2025-09-06T05:33:46.777506Z"
    }
   },
   "source": [
    "datasets[\"train\"][0]"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'title': '望海楼美国打“台湾牌”是危险的赌博',\n",
       " 'content': '近期，美国国会众院通过法案，重申美国对台湾的承诺。对此，中国外交部发言人表示，有关法案严重违反一个中国原则和中美三个联合公报规定，粗暴干涉中国内政，中方对此坚决反对并已向美方提出严正交涉。\\n事实上，中国高度关注美国国内打“台湾牌”、挑战一中原则的危险动向。近年来，作为“亲台”势力大本营的美国国会动作不断，先后通过“与台湾交往法”“亚洲再保证倡议法”等一系列“挺台”法案，“2019财年国防授权法案”也多处触及台湾问题。今年3月，美参院亲台议员再抛“台湾保证法”草案。众院议员继而在4月提出众院版的草案并在近期通过。上述法案的核心目标是强化美台关系，并将台作为美“印太战略”的重要伙伴。同时，“亲台”议员还有意制造事端。今年2月，5名共和党参议员致信众议院议长，促其邀请台湾地区领导人在国会上发表讲话。这一动议显然有悖于美国与台湾的非官方关系，其用心是实质性改变美台关系定位。\\n上述动向出现并非偶然。在中美建交40周年之际，两国关系摩擦加剧，所谓“中国威胁论”再次沉渣泛起。美国对华认知出现严重偏差，对华政策中负面因素上升，保守人士甚至成立了“当前中国威胁委员会”。在此背景下，美国将台海关系作为战略抓手，通过打“台湾牌”在双边关系中增加筹码。特朗普就任后，国会对总统外交政策的约束力和塑造力加强。其实国会推动通过涉台法案对行政部门不具约束力，美政府在2018年并未提升美台官员互访级别，美军舰也没有“访问”台湾港口，保持着某种克制。但从美总统签署国会通过的法案可以看出，国会对外交产生了影响。立法也为政府对台政策提供更大空间。\\n然而，美国需要认真衡量打“台湾牌”成本。首先是美国应对危机的代价。美方官员和学者已明确发出警告，美国卷入台湾问题得不偿失。美国学者曾在媒体发文指出，如果台海爆发危机，美国可能需要“援助”台湾，进而导致新的冷战乃至与中国大陆的冲突。但如果美国让台湾自己面对，则有损美国的信誉，影响美盟友对同盟关系的支持。其次是对中美关系的危害。历史证明，中美合则两利、斗则两伤。中美关系是当今世界最重要的双边关系之一，保持中美关系的稳定发展，不仅符合两国和两国人民的根本利益，也是国际社会的普遍期待。美国蓄意挑战台湾问题的底线，加剧中美关系的复杂性和不确定性，损害两国在重要领域合作，损人又害己。\\n美国打“台湾牌”是一场危险的赌博。台湾问题是中国核心利益，中国政府和人民决不会对此坐视不理。中国敦促美方恪守一个中国原则和中美三个联合公报规定，阻止美国会审议推进有关法案，妥善处理涉台问题。美国悬崖勒马，才是明智之举。\\n（作者系中国国际问题研究院国际战略研究所副所长）'}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 12
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:33:56.552866Z",
     "start_time": "2025-09-06T05:33:56.547867Z"
    }
   },
   "source": [
    "datasets[\"train\"][:2]"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'title': ['望海楼美国打“台湾牌”是危险的赌博', '大力推进高校治理能力建设'],\n",
       " 'content': ['近期，美国国会众院通过法案，重申美国对台湾的承诺。对此，中国外交部发言人表示，有关法案严重违反一个中国原则和中美三个联合公报规定，粗暴干涉中国内政，中方对此坚决反对并已向美方提出严正交涉。\\n事实上，中国高度关注美国国内打“台湾牌”、挑战一中原则的危险动向。近年来，作为“亲台”势力大本营的美国国会动作不断，先后通过“与台湾交往法”“亚洲再保证倡议法”等一系列“挺台”法案，“2019财年国防授权法案”也多处触及台湾问题。今年3月，美参院亲台议员再抛“台湾保证法”草案。众院议员继而在4月提出众院版的草案并在近期通过。上述法案的核心目标是强化美台关系，并将台作为美“印太战略”的重要伙伴。同时，“亲台”议员还有意制造事端。今年2月，5名共和党参议员致信众议院议长，促其邀请台湾地区领导人在国会上发表讲话。这一动议显然有悖于美国与台湾的非官方关系，其用心是实质性改变美台关系定位。\\n上述动向出现并非偶然。在中美建交40周年之际，两国关系摩擦加剧，所谓“中国威胁论”再次沉渣泛起。美国对华认知出现严重偏差，对华政策中负面因素上升，保守人士甚至成立了“当前中国威胁委员会”。在此背景下，美国将台海关系作为战略抓手，通过打“台湾牌”在双边关系中增加筹码。特朗普就任后，国会对总统外交政策的约束力和塑造力加强。其实国会推动通过涉台法案对行政部门不具约束力，美政府在2018年并未提升美台官员互访级别，美军舰也没有“访问”台湾港口，保持着某种克制。但从美总统签署国会通过的法案可以看出，国会对外交产生了影响。立法也为政府对台政策提供更大空间。\\n然而，美国需要认真衡量打“台湾牌”成本。首先是美国应对危机的代价。美方官员和学者已明确发出警告，美国卷入台湾问题得不偿失。美国学者曾在媒体发文指出，如果台海爆发危机，美国可能需要“援助”台湾，进而导致新的冷战乃至与中国大陆的冲突。但如果美国让台湾自己面对，则有损美国的信誉，影响美盟友对同盟关系的支持。其次是对中美关系的危害。历史证明，中美合则两利、斗则两伤。中美关系是当今世界最重要的双边关系之一，保持中美关系的稳定发展，不仅符合两国和两国人民的根本利益，也是国际社会的普遍期待。美国蓄意挑战台湾问题的底线，加剧中美关系的复杂性和不确定性，损害两国在重要领域合作，损人又害己。\\n美国打“台湾牌”是一场危险的赌博。台湾问题是中国核心利益，中国政府和人民决不会对此坐视不理。中国敦促美方恪守一个中国原则和中美三个联合公报规定，阻止美国会审议推进有关法案，妥善处理涉台问题。美国悬崖勒马，才是明智之举。\\n（作者系中国国际问题研究院国际战略研究所副所长）',\n",
       "  '在推进“双一流”高校建设进程中，我们要紧紧围绕为党育人、为国育才，找准问题、破解难题，以一流意识和担当精神，大力推进高校的治理能力建设。\\n增强政治引领力。坚持党对高校工作的全面领导，始终把政治建设摆在首位，增强校党委的政治领导力，全面推进党的建设各项工作。落实立德树人根本任务，把培养社会主义建设者和接班人放在中心位置。紧紧抓住思想政治工作这条生命线，全面加强师生思想政治工作，推进“三全育人”综合改革，将思想政治工作贯穿学校教育管理服务全过程，努力让学生成为德才兼备、全面发展的人才。\\n提升人才聚集力。人才是创新的核心要素，创新驱动本质上是人才驱动。要坚持引育并举，建立绿色通道，探索知名专家举荐制，完善“一事一议”支持机制。在大力支持自然科学人才队伍建设的同时，实施哲学社会科学人才工程。立足实际，在条件成熟的学院探索“一院一策”改革。创新科研组织形式，为人才成长创设空间，建设更加崇尚学术、更加追求卓越、更加关爱学生、更加担当有为的学术共同体。\\n培养学生竞争力。遵循学生成长成才的规律培育人才，着力培养具有国际竞争力的拔尖创新人才和各类专门人才，使优势学科、优秀教师、优质资源、优良环境围绕立德树人的根本任务配置。淘汰“水课”，打造“金课”，全力打造世界一流本科教育。深入推进研究生教育综合改革，加强事关国家重大战略的高精尖急缺人才培养，建设具有国际竞争力的研究生教育。\\n激发科技创新力。在国家急需发展的领域挑大梁，就要更加聚焦科技前沿和国家需求，狠抓平台建设，包括加快牵头“武汉光源”建设步伐，积极参与国家实验室建设，建立校级大型科研仪器设备共享平台。关键核心技术领域“卡脖子”问题，归根结底是基础科学研究薄弱。要加大基础研究的支持力度，推进理论、技术和方法创新，鼓励支持重大原创和颠覆性技术创新，催生一批高水平、原创性研究成果。\\n发展社会服务力。在贡献和服务中体现价值，推动合作共建、多元投入的格局，大力推进政产学研用结合，强化科技成果转移转化及产业化。探索校城融合发展、校地联动发展的新模式，深度融入地方创新发展网络，为地方经济社会发展提供人才支撑，不断拓展和优化社会服务网络。\\n涵育文化软实力。加快体制机制改革，优化学校、学部、学院三级评审机制，充分发挥优秀学者特别是德才兼备的年轻学者在学术治理中的重要作用。牢固树立一流意识、紧紧围绕一流目标、认真执行一流标准，让成就一流事业成为普遍追求和行动自觉。培育具有强大凝聚力的大学文化，营造积极团结、向上向善、干事创业的氛围，让大学成为吸引和留住一大批优秀人才建功立业的沃土，让敢干事、肯干事、能干事的人有更多的荣誉感和获得感。\\n建设中国特色、世界一流大学不是等得来、喊得来的，而是脚踏实地拼出来、干出来的。对标一流，深化改革，坚持按章程办学，构建以一流质量标准为核心的制度规范体系，扎实推进学校综合改革，探索更具活力、更富效率的管理体制和运行机制，我们就一定能构建起具有中国特色的现代大学治理体系，进一步提升管理服务水平和工作效能。\\n（作者系武汉大学校长）']}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 13
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:34:12.934548Z",
     "start_time": "2025-09-06T05:34:12.926112Z"
    }
   },
   "source": [
    "datasets[\"train\"][\"title\"][:5]"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['望海楼美国打“台湾牌”是危险的赌博',\n",
       " '大力推进高校治理能力建设',\n",
       " '坚持事业为上选贤任能',\n",
       " '“大朋友”的话儿记心头',\n",
       " '用好可持续发展这把“金钥匙”']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 15
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:34:20.540028Z",
     "start_time": "2025-09-06T05:34:20.535905Z"
    }
   },
   "source": [
    "datasets[\"train\"].column_names"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['title', 'content']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 16
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:34:22.192743Z",
     "start_time": "2025-09-06T05:34:22.187743Z"
    }
   },
   "source": [
    "datasets[\"train\"].features"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'title': Value(dtype='string', id=None),\n",
       " 'content': Value(dtype='string', id=None)}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 17
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据集划分"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:34:25.363087Z",
     "start_time": "2025-09-06T05:34:25.341624Z"
    }
   },
   "source": [
    "dataset = datasets[\"train\"]\n",
    "dataset.train_test_split(test_size = 0.1)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['title', 'content'],\n",
       "        num_rows: 5265\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['title', 'content'],\n",
       "        num_rows: 585\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 18
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "dataset = boolq_dataset[\"train\"]\n",
    "dataset.train_test_split(test_size = 0.1, stratify_by_column = \"label\")  # 分类数据集可以按照比例划分"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据选取与过滤"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:34:29.509123Z",
     "start_time": "2025-09-06T05:34:29.504123Z"
    }
   },
   "source": [
    "# 选取\n",
    "datasets[\"train\"].select([0, 1])"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['title', 'content'],\n",
       "    num_rows: 2\n",
       "})"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 19
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:34:38.612008Z",
     "start_time": "2025-09-06T05:34:38.605996Z"
    }
   },
   "source": [
    "# 过滤\n",
    "filter_dataset = datasets[\"train\"].filter(lambda example: \"中国\" in example[\"title\"])"
   ],
   "outputs": [],
   "execution_count": 20
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:34:45.706710Z",
     "start_time": "2025-09-06T05:34:45.698442Z"
    }
   },
   "source": [
    "filter_dataset[\"title\"][:5]"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['聚焦两会，世界探寻中国成功秘诀',\n",
       " '望海楼中国经济的信心来自哪里',\n",
       " '“中国奇迹”助力世界减贫跑出加速度',\n",
       " '和音瞩目历史交汇点上的中国',\n",
       " '中国风采感染世界']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 21
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据映射"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:34:58.050143Z",
     "start_time": "2025-09-06T05:34:58.047111Z"
    }
   },
   "source": [
    "def add_prefix(example):\n",
    "\texample[\"title\"] = 'Prefix: ' + example[\"title\"]\n",
    "\treturn example"
   ],
   "outputs": [],
   "execution_count": 22
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:00.612601Z",
     "start_time": "2025-09-06T05:35:00.594090Z"
    }
   },
   "source": [
    "prefix_dataset = datasets.map(add_prefix)\n",
    "prefix_dataset[\"train\"][:10][\"title\"]"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Prefix: 望海楼美国打“台湾牌”是危险的赌博',\n",
       " 'Prefix: 大力推进高校治理能力建设',\n",
       " 'Prefix: 坚持事业为上选贤任能',\n",
       " 'Prefix: “大朋友”的话儿记心头',\n",
       " 'Prefix: 用好可持续发展这把“金钥匙”',\n",
       " 'Prefix: 跨越雄关，我们走在大路上',\n",
       " 'Prefix: 脱贫奇迹彰显政治优势',\n",
       " 'Prefix: 拱卫亿万人共同的绿色梦想',\n",
       " 'Prefix: 为党育人、为国育才',\n",
       " 'Prefix: 净化网络语言']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 23
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:06.214132Z",
     "start_time": "2025-09-06T05:35:03.734716Z"
    }
   },
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-chinese\")\n",
    "\n",
    "\n",
    "def preprocess_function(example, tokenizer = tokenizer):\n",
    "\tmodel_inputs = tokenizer(example[\"content\"], max_length = 512, truncation = True)\n",
    "\tlabels = tokenizer(example[\"title\"], max_length = 32, truncation = True)\n",
    "\t# label就是title编码的结果\n",
    "\tmodel_inputs[\"labels\"] = labels[\"input_ids\"]\n",
    "\treturn model_inputs"
   ],
   "outputs": [],
   "execution_count": 24
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:09.686387Z",
     "start_time": "2025-09-06T05:35:09.659034Z"
    }
   },
   "source": [
    "processed_datasets = datasets.map(preprocess_function)\n",
    "processed_datasets"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 5850\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 1679\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 25
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:13.640868Z",
     "start_time": "2025-09-06T05:35:13.595405Z"
    }
   },
   "source": [
    "processed_datasets = datasets.map(preprocess_function, num_proc = 4)\n",
    "processed_datasets"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 5850\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 1679\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 26
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:17.307079Z",
     "start_time": "2025-09-06T05:35:17.280052Z"
    }
   },
   "source": [
    "processed_datasets = datasets.map(preprocess_function, batched = True)\n",
    "processed_datasets"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 5850\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['title', 'content', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 1679\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 27
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:23.104448Z",
     "start_time": "2025-09-06T05:35:20.534768Z"
    }
   },
   "source": [
    "processed_datasets = datasets.map(preprocess_function, batched = True, remove_columns = datasets[\"train\"].column_names)\n",
    "processed_datasets"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Map:   0%|          | 0/1679 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "40802c0addff47a68f2856281758bfbc"
      }
     },
     "metadata": {},
     "output_type": "display_data",
     "jetTransient": {
      "display_id": null
     }
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 5850\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 1679\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 28
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存与加载"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:44.339978Z",
     "start_time": "2025-09-06T05:35:44.290205Z"
    }
   },
   "source": "processed_datasets.save_to_disk(\"./processed_data\")",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/5850 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "c56e1ce0ee25486ca63d84795d420f4a"
      }
     },
     "metadata": {},
     "output_type": "display_data",
     "jetTransient": {
      "display_id": null
     }
    },
    {
     "data": {
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/1679 [00:00<?, ? examples/s]"
      ],
      "application/vnd.jupyter.widget-view+json": {
       "version_major": 2,
       "version_minor": 0,
       "model_id": "2ae3a935158b40ada9c568bf090baeb0"
      }
     },
     "metadata": {},
     "output_type": "display_data",
     "jetTransient": {
      "display_id": null
     }
    }
   ],
   "execution_count": 29
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:46.452154Z",
     "start_time": "2025-09-06T05:35:46.435152Z"
    }
   },
   "source": [
    "processed_datasets = load_from_disk(\"./processed_data\")\n",
    "processed_datasets"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 5850\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "        num_rows: 1679\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 30
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 加载本地数据集"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 直接加载文件作为数据集"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:50.530459Z",
     "start_time": "2025-09-06T05:35:49.767147Z"
    }
   },
   "source": [
    "dataset = load_dataset(\"csv\", data_files = \"./ChnSentiCorp_htl_all.csv\", split = \"train\")\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 7766\n",
       "})"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 31
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:53.226845Z",
     "start_time": "2025-09-06T05:35:53.206871Z"
    }
   },
   "source": [
    "dataset = Dataset.from_csv(\"./ChnSentiCorp_htl_all.csv\")\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 7766\n",
       "})"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 32
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载文件夹内全部文件作为数据集"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:35:57.696914Z",
     "start_time": "2025-09-06T05:35:56.897964Z"
    }
   },
   "source": [
    "dataset = load_dataset(\"csv\",\n",
    "\tdata_files = [\"./all_data/ChnSentiCorp_htl_all.csv\", \"./all_data/ChnSentiCorp_htl_all copy.csv\"], split = 'train')\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 15532\n",
       "})"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 33
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 通过预先加载的其他格式转换加载数据集"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:00.848418Z",
     "start_time": "2025-09-06T05:36:00.804556Z"
    }
   },
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "data = pd.read_csv(\"./ChnSentiCorp_htl_all.csv\")\n",
    "data.head()"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "   label                                             review\n",
       "0      1  距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...\n",
       "1      1                       商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!\n",
       "2      1         早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。\n",
       "3      1  宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...\n",
       "4      1               CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 34
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:05.541088Z",
     "start_time": "2025-09-06T05:36:05.517576Z"
    }
   },
   "source": [
    "dataset = Dataset.from_pandas(data)\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 7766\n",
       "})"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 35
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:07.570649Z",
     "start_time": "2025-09-06T05:36:07.564333Z"
    }
   },
   "source": [
    "# List格式的数据需要内嵌{}，明确数据字段\n",
    "data = [{\"text\": \"abc\"}, {\"text\": \"def\"}]\n",
    "# data = [\"abc\", \"def\"]\n",
    "Dataset.from_list(data)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text'],\n",
       "    num_rows: 2\n",
       "})"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 36
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 通过自定义加载脚本加载数据集"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:25.884668Z",
     "start_time": "2025-09-06T05:36:25.057030Z"
    }
   },
   "source": "load_dataset(\"json\", data_files = \"./cmrc2018_trial.json\", field = \"data\")",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['paragraphs', 'id', 'title'],\n",
       "        num_rows: 256\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 37
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:28.344511Z",
     "start_time": "2025-09-06T05:36:28.320628Z"
    }
   },
   "source": [
    "dataset = load_dataset(\"./load_script.py\", split = \"train\", trust_remote_code=True)\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['id', 'context', 'question', 'answers'],\n",
       "    num_rows: 1002\n",
       "})"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 38
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:30.461398Z",
     "start_time": "2025-09-06T05:36:30.455504Z"
    }
   },
   "source": [
    "dataset[0]"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'id': 'TRIAL_800_QUERY_0',\n",
       " 'context': '基于《跑跑卡丁车》与《泡泡堂》上所开发的游戏，由韩国Nexon开发与发行。中国大陆由盛大游戏运营，这是Nexon时隔6年再次授予盛大网络其游戏运营权。台湾由游戏橘子运营。玩家以水枪、小枪、锤子或是水炸弹泡封敌人(玩家或NPC)，即为一泡封，将水泡击破为一踢爆。若水泡未在时间内踢爆，则会从水泡中释放或被队友救援(即为一救援)。每次泡封会减少生命数，生命数耗完即算为踢爆。重生者在一定时间内为无敌状态，以踢爆数计分较多者获胜，规则因模式而有差异。以2V2、4V4随机配对的方式，玩家可依胜场数爬牌位(依序为原石、铜牌、银牌、金牌、白金、钻石、大师) ，可选择经典、热血、狙击等模式进行游戏。若游戏中离，则4分钟内不得进行配对(每次中离+4分钟)。开放时间为暑假或寒假期间内不定期开放，8人经典模式随机配对，采计分方式，活动时间内分数越多，终了时可依该名次获得奖励。',\n",
       " 'question': '生命数耗完即算为什么？',\n",
       " 'answers': {'text': ['踢爆'], 'answer_start': [127]}}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 39
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset with DataCollator"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:34.499808Z",
     "start_time": "2025-09-06T05:36:34.025866Z"
    }
   },
   "source": "from transformers import DataCollatorWithPadding",
   "outputs": [],
   "execution_count": 40
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:36.724937Z",
     "start_time": "2025-09-06T05:36:35.959889Z"
    }
   },
   "source": [
    "dataset = load_dataset(\"csv\", data_files = \"./ChnSentiCorp_htl_all.csv\", split = 'train')\n",
    "dataset = dataset.filter(lambda x: x[\"review\"] is not None)\n",
    "dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label', 'review'],\n",
       "    num_rows: 7765\n",
       "})"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 41
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:38.815005Z",
     "start_time": "2025-09-06T05:36:38.811794Z"
    }
   },
   "source": [
    "def process_function(examples):\n",
    "\ttokenized_examples = tokenizer(examples[\"review\"], max_length = 128, truncation = True)\n",
    "\ttokenized_examples[\"labels\"] = examples[\"label\"]\n",
    "\treturn tokenized_examples"
   ],
   "outputs": [],
   "execution_count": 42
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:40.467331Z",
     "start_time": "2025-09-06T05:36:40.454163Z"
    }
   },
   "source": [
    "tokenized_dataset = dataset.map(process_function, batched = True, remove_columns = dataset.column_names)\n",
    "tokenized_dataset"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],\n",
       "    num_rows: 7765\n",
       "})"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 43
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:42.919476Z",
     "start_time": "2025-09-06T05:36:42.915475Z"
    }
   },
   "source": [
    "print(tokenized_dataset[:3])"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': [[101, 6655, 4895, 2335, 3763, 1062, 6662, 6772, 6818, 117, 852, 3221, 1062, 769, 2900, 4850, 679, 2190, 117, 1963, 3362, 3221, 107, 5918, 7355, 5296, 107, 4638, 6413, 117, 833, 7478, 2382, 7937, 4172, 119, 2456, 6379, 4500, 1166, 4638, 6662, 5296, 119, 2791, 7313, 6772, 711, 5042, 1296, 119, 102], [101, 1555, 1218, 1920, 2414, 2791, 8024, 2791, 7313, 2523, 1920, 8024, 2414, 3300, 100, 2160, 8024, 3146, 860, 2697, 6230, 5307, 3845, 2141, 2669, 679, 7231, 106, 102], [101, 3193, 7623, 1922, 2345, 8024, 3187, 6389, 1343, 1914, 2208, 782, 8024, 6929, 6804, 738, 679, 1217, 7608, 1501, 4638, 511, 6983, 2421, 2418, 6421, 7028, 6228, 671, 678, 6821, 702, 7309, 7579, 749, 511, 2791, 7313, 3315, 6716, 2523, 1962, 511, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [1, 1, 1]}\n"
     ]
    }
   ],
   "execution_count": 44
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:44.942243Z",
     "start_time": "2025-09-06T05:36:44.938244Z"
    }
   },
   "source": "collator = DataCollatorWithPadding(tokenizer = tokenizer)",
   "outputs": [],
   "execution_count": 45
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:46.811577Z",
     "start_time": "2025-09-06T05:36:46.808544Z"
    }
   },
   "source": [
    "from torch.utils.data import DataLoader"
   ],
   "outputs": [],
   "execution_count": 46
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:48.177362Z",
     "start_time": "2025-09-06T05:36:48.173427Z"
    }
   },
   "source": "dl = DataLoader(tokenized_dataset, batch_size = 4, collate_fn = collator, shuffle = True)",
   "outputs": [],
   "execution_count": 47
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-09-06T05:36:49.822038Z",
     "start_time": "2025-09-06T05:36:49.809825Z"
    }
   },
   "source": [
    "num = 0\n",
    "for batch in dl:\n",
    "\tprint(batch[\"input_ids\"].size())\n",
    "\tnum += 1\n",
    "\tif num > 10:\n",
    "\t\tbreak"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([4, 128])\n",
      "torch.Size([4, 107])\n",
      "torch.Size([4, 128])\n",
      "torch.Size([4, 128])\n",
      "torch.Size([4, 128])\n",
      "torch.Size([4, 127])\n",
      "torch.Size([4, 128])\n",
      "torch.Size([4, 128])\n",
      "torch.Size([4, 128])\n",
      "torch.Size([4, 128])\n",
      "torch.Size([4, 128])\n"
     ]
    }
   ],
   "execution_count": 48
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "transformers",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
